/*************************************************************************
MAKE_RD.SAS

 STEP 1: Import data
 STEP 2: Merge all data
 STEP 3: Create variables used for analysis    
*************************************************************************/

* Define the local directory in which you are working;
%LET LOCAL = /mnt/Research/RESTAT;
LIBNAME HERE "&LOCAL";
* IMS R&D Focus data should be put in a subdirectory called IMS_RD;
LIBNAME IMS "&LOCAL./IMS_RD";


/*************************************************************************
  STEP 1: Import data
    Latest download is March 2007.

   Files are named with the following format:
	Class%TCLASS%_Set_%SETN%_File_%FILEN%.csv

   TCLASS = therapeutic class
   SETN = set of variables (1-5)
   FILEN = set of records (blocks of 250)

   The output is a dataset with one line per project.  Where there
    are multiple classes, firms, etc. associated with a project, these
    are put in arrays.

   NOTES:
   Keep observations in their initial order for matching to the history
   data in the next step.
*************************************************************************/

* Macro to loop through all the files and create one SAS dataset per ATC1;
%MACRO NFILES(TCLASS,HOWMANY);

  * HOWMANY is the max of FILEN. In 2007, only 250 records could be exported at a time.
    In 2012, 1000 records could be exported at a time. The number of drug projects varies
    by ATC1 and over time, so HOWMANY will change for each download.;  
  %DO I = 1 %TO &HOWMANY;

   * These correspond to the 5 templates defined for downloading.;
     FILENAME IN1 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._Set_1_File_&I..csv";
     FILENAME IN2 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._Set_2_File_&I..csv";
     FILENAME IN3 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._Set_3_File_&I..csv";
     FILENAME IN4 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._Set_4_File_&I..csv";
     FILENAME IN5 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._Set_5_File_&I..csv";

   * Read in each set of records;
     DATA DATA1_&I;
       INFILE IN1 DSD DELIMITER=',' FIRSTOBS=3 LRECL=50000;
       LENGTH PREFERRED_NAME $100;
       LENGTH PRODUCT_NAME $500;
       LENGTH COMPANY $500;
       LENGTH NATIONALITY $300;
       LENGTH LATEST_PHASE $20;
       LENGTH CLASS_CODE $50;
       LENGTH CLASS_DESCRIPTION $1000;
       LENGTH CAS_NUMBER $400;
       LENGTH ACTION $200;
       INPUT PREFERRED_NAME
             PRODUCT_NAME
             COMPANY
             NATIONALITY
             LATEST_PHASE
             CLASS_CODE
             CLASS_DESCRIPTION
             CAS_NUMBER
             ACTION;

     * This information is important for the creation of arrays later;
       ARRAY VARLIST_A{*} $ PREFERRED_NAME PRODUCT_NAME COMPANY NATIONALITY LATEST_PHASE 
	CLASS_CODE CLASS_DESCRIPTION CAS_NUMBER ACTION;
       ARRAY LENGTHS{*} LENGTH_1_1-LENGTH_1_9;
       ARRAY VARLIST_B{*} $ PRODUCT_NAME COMPANY CLASS_CODE CAS_NUMBER ACTION;
       ARRAY COUNTS{*} NUMBER_NAMES NUMBER_FIRMS NUMBER_CLASS NUMBER_CAS NUMBER_ACTION;
       DO I=1 TO DIM(VARLIST_A);
         LENGTHS{I} = LENGTH(VARLIST_A{I});
       END;
       DO I=1 TO DIM(VARLIST_B);
         COUNTS{I} = COUNTC(VARLIST_B{I},';') + 1;
         IF VARLIST_B{I} EQ '' THEN COUNTS{I}=0;
       END;

     DATA DATA2_&I;
       INFILE IN2 DSD DELIMITER=',' FIRSTOBS=3 LRECL=80000;
       LENGTH PREFERRED_NAME $100;
       LENGTH PRODUCT_NAME $500;
       LENGTH COMPANY $500;
       LENGTH LEAD_COMPANY $100;
       LENGTH FRANCHISE_COMPANY $20000;
       LENGTH FRANCHISE_COMPANY_NATIONALITY $20000;
       LENGTH FRANCHISE_CORP $2000;
       LENGTH FRANCHISE_CORP_NATIONALITY $2000;
       LENGTH FRANCHISE_RELATIONSHIP $2000;
       INPUT PREFERRED_NAME
             PRODUCT_NAME
             COMPANY
             LEAD_COMPANY
             FRANCHISE_COMPANY
   	     FRANCHISE_COMPANY_NATIONALITY
             FRANCHISE_CORP
   	     FRANCHISE_CORP_NATIONALITY
             FRANCHISE_RELATIONSHIP;
      * This information is important for the creation of arrays later;
        ARRAY VARLIST_A{*} $ PREFERRED_NAME PRODUCT_NAME COMPANY LEAD_COMPANY 
            FRANCHISE_COMPANY FRANCHISE_COMPANY_NATIONALITY FRANCHISE_CORP 
            FRANCHISE_CORP_NATIONALITY FRANCHISE_RELATIONSHIP;
        ARRAY LENGTHS{*} LENGTH_2_1-LENGTH_2_9;
        ARRAY VARLIST_B{*} $ FRANCHISE_COMPANY;
        ARRAY COUNTS{*} NUMBER_FRANCHISE;
        DO I=1 TO DIM(VARLIST_A);
          LENGTHS{I} = LENGTH(VARLIST_A{I});
        END;
        DO I=1 TO DIM(VARLIST_B);
          COUNTS{I} = COUNTC(VARLIST_B{I},';') + 1;
          IF VARLIST_B{I} EQ '' THEN COUNTS{I}=0;
        END;   
   
     DATA DATA3_&I;
       INFILE IN3 DSD DELIMITER=',' FIRSTOBS=3 LRECL=80000;
       LENGTH PREFERRED_NAME $100;
       LENGTH PRODUCT_NAME $500;
       LENGTH COMPANY $500;
       LENGTH PATENTEE $100;
       LENGTH PATENT_DATA $2000;
       LENGTH LATEST_PHASE $20;
       LENGTH DEV_SUMMARY $10000;
       LENGTH SCIENCE_SUMMARY $10000;
       LENGTH MODE_ADMIN $500;
       INPUT PREFERRED_NAME
             PRODUCT_NAME
             COMPANY
             PATENTEE
             PATENT_DATA
             LATEST_PHASE
             DEV_SUMMARY
             SCIENCE_SUMMARY
             MODE_ADMIN;
      * This information is important for the creation of arrays later;   
        ARRAY VARLIST_A{*} $ PREFERRED_NAME PRODUCT_NAME COMPANY PATENTEE 
   	PATENT_DATA LATEST_PHASE DEV_SUMMARY SCIENCE_SUMMARY MODE_ADMIN;
        ARRAY LENGTHS{*} LENGTH_3_1-LENGTH_3_9;
        ARRAY VARLIST_B{*} $ PATENTEE;
        ARRAY COUNTS{*} NUMBER_PATENTEES;
        DO I=1 TO DIM(VARLIST_A);
          LENGTHS{I} = LENGTH(VARLIST_A{I});
        END;
        DO I=1 TO DIM(VARLIST_B);
          COUNTS{I} = COUNTC(VARLIST_B{I},';') + 1;
          IF VARLIST_B{I} EQ '' THEN COUNTS{I}=0;
        END;
   
     DATA DATA4_&I;
       INFILE IN4 DSD DELIMITER=',' FIRSTOBS=3 LRECL=50000;
       LENGTH PREFERRED_NAME $100;
       LENGTH PRODUCT_NAME $500;
       LENGTH COMPANY $500;
       LENGTH COUNTRY_STATUS $1000;
       LENGTH COUNTRY_INDICATION $1000;
       LENGTH COUNTRY_PHASE $1000;
       LENGTH LAUNCH_COUNTRY $200;
       INPUT PREFERRED_NAME
             PRODUCT_NAME
             COMPANY
             COUNTRY_STATUS 
   	     COUNTRY_INDICATION
             COUNTRY_PHASE
             LAUNCH_COUNTRY;
     * This information is important for the creation of arrays later;   
       ARRAY VARLIST_A{*} $ PREFERRED_NAME PRODUCT_NAME COMPANY  
           COUNTRY_STATUS COUNTRY_INDICATION COUNTRY_PHASE LAUNCH_COUNTRY;
       ARRAY LENGTHS{*} LENGTH_4_1-LENGTH_4_7;
       ARRAY VARLIST_B{*} $  COUNTRY_STATUS;
       ARRAY COUNTS{*} NUMBER_COUNTRY;
       DO I=1 TO DIM(VARLIST_A);
         LENGTHS{I} = LENGTH(VARLIST_A{I});
       END;
       DO I=1 TO DIM(VARLIST_B);
         COUNTS{I} = COUNTC(VARLIST_B{I},';') + 1;
         IF VARLIST_B{I} EQ '' THEN COUNTS{I}=0;
       END;
   
     DATA DATA5_&I;
       INFILE IN5 DSD DELIMITER=',' FIRSTOBS=3 LRECL=80000;
       LENGTH PREFERRED_NAME $100;
       LENGTH PRODUCT_NAME $500;
       LENGTH COMPANY $500;
       LENGTH ACTIVE_PROGRAM $300;
       LENGTH ESTIMATED_LAUNCH $200;
       LENGTH INDICATION $2000;
       LENGTH HISTORY $20000;
       LENGTH LATEST_NEWS $20000;
       LENGTH UPDATE_DATE $10;
       INPUT PREFERRED_NAME
             PRODUCT_NAME
             COMPANY
             ACTIVE_PROGRAM
             ESTIMATED_LAUNCH
             INDICATION
             HISTORY
             LATEST_NEWS
             UPDATE_DATE;
     * This information is important for the creation of arrays later; 
       ARRAY VARLIST_A{*} $ PREFERRED_NAME PRODUCT_NAME COMPANY ACTIVE_PROGRAM 
           ESTIMATED_LAUNCH INDICATION HISTORY LATEST_NEWS UPDATE_DATE;
       ARRAY LENGTHS{*} LENGTH_5_1-LENGTH_5_9;
       ARRAY VARLIST_B{*} $ INDICATION HISTORY LATEST_NEWS UPDATE_DATE;
       ARRAY COUNTS{*} NUMBER_INDICATION NUMBER_HISTORY NUMBER_NEWS NUMBER_UPDATE;
       DO I=1 TO DIM(VARLIST_A);
         LENGTHS{I} = LENGTH(VARLIST_A{I});
       END;
       DO I=1 TO DIM(VARLIST_B);
         COUNTS{I} = COUNTC(VARLIST_B{I},';') + 1;
         IF VARLIST_B{I} EQ '' THEN COUNTS{I}=0;
       END;
   
   * Merge the 5 files here;
     DATA DATA&I;
       MERGE DATA1_&I (RENAME=(PREFERRED_NAME=NAME1)) 
   	  DATA2_&I (RENAME=(PREFERRED_NAME=NAME2))
   	  DATA3_&I (RENAME=(PREFERRED_NAME=NAME3)) 
   	  DATA4_&I (RENAME=(PREFERRED_NAME=NAME4)) 
   	  DATA5_&I (RENAME=(PREFERRED_NAME=NAME5));
     * Check on names to make sure match is correct;
       IF NAME1 NE NAME2 THEN PUT NAME1= NAME2=;
       IF NAME1 NE NAME3 THEN PUT NAME1= NAME3=;
       IF NAME1 NE NAME4 THEN PUT NAME1= NAME4=;
       IF NAME1 NE NAME5 THEN PUT NAME1= NAME5=;
       PREFERRED_NAME=NAME1;
       DROP NAME1-NAME5;
   
     * Create or append to ATC1 dataset and clean up;  
       %IF &I EQ 1 %THEN %DO;
   	DATA HERE.CLASS&TCLASS; SET DATA&I;
       %END;
       %ELSE %DO;
   	PROC APPEND BASE=HERE.CLASS&TCLASS DATA=DATA&I;
       %END;
       PROC DATASETS LIBRARY=WORK; DELETE DATA1_&I DATA2_&I DATA3_&I DATA4_&I DATA5_&I;
   
     %END; * Ends loop over files with blocks of observations;
   
     DATA HERE.CLASS&TCLASS;
       SET HERE.CLASS&TCLASS; 
   
     * Convert update date to a SAS date;
       DATE_UPDATE = MDY(SUBSTR(UPDATE_DATE,4,2),SUBSTR(UPDATE_DATE,1,2),SUBSTR(UPDATE_DATE,7,4));
   
     * Arrays of variables;
       %MAKEARRAYS(PRODUCT_NAME,NAMES,18);
       %MAKEARRAYS(COMPANY,FIRMS,32);
       %MAKEARRAYS(CLASS_CODE,CLASSES,9);
       %MAKEARRAYS(CAS_NUMBER,CAS_NOS,31);
       %MAKEARRAYS(ACTION,ACTIONS,15);
       %MAKEARRAYS(FRANCHISE_COMPANY,F_COMPANY,30);
       %MAKEARRAYS(FRANCHISE_COMPANY_NATIONALITY,F_COMPANY_NATION,30);
       %MAKEARRAYS(FRANCHISE_CORP,F_CORP,30);
       %MAKEARRAYS(FRANCHISE_CORP_NATIONALITY,F_CORP_NATION,30);
       %MAKEARRAYS(FRANCHISE_RELATIONSHIP,F_RELATION,30);
       %MAKEARRAYS(PATENTEE,PATENT_HOLDER,2);
       %MAKEARRAYS(COUNTRY_STATUS,STATUS_C,103);
       %MAKEARRAYS(COUNTRY_PHASE,PHASE_C,103);
       %MAKEARRAYS(COUNTRY_INDICATION,INDICATION_C,103);
       %MAKEARRAYS(HISTORY,EVENT,80);
       %MAKEARRAYS(LATEST_NEWS,NEWS,8);
       %MAKEARRAYS(ESTIMATED_LAUNCH,LAUNCHES,8);
       %MAKEARRAYS(INDICATION,INDIC,22);
   
       DROP LENGTH_1_1-LENGTH_1_9 LENGTH_2_1-LENGTH_2_9 LENGTH_3_1-LENGTH_3_9 
            LENGTH_4_1-LENGTH_4_7 LENGTH_5_1-LENGTH_5_9 SCRAP START FINISH I;
   
     PROC CONTENTS DATA=HERE.CLASS&TCLASS;
     PROC MEANS DATA=HERE.CLASS&TCLASS;
   
   %MEND(NFILES);

* Macro to parse long text fields into arrays of variables;
%MACRO MAKEARRAYS(VARNAME,ARRAYNAME,ARRAYSIZE);
   ARRAY &ARRAYNAME(&ARRAYSIZE) $300 &VARNAME.1-&VARNAME.&ARRAYSIZE;
   START = 1; FINISH=1;
   LENGTH SCRAP $1000;
   SCRAP = TRIM(&VARNAME);
   I=1;
   DO WHILE (I LE &ARRAYSIZE);
       FINISH = INDEX(SCRAP,';')-1;
       IF FINISH LT 0 THEN DO;
           &ARRAYNAME{I} = SCRAP;
           I = &ARRAYSIZE+1;
       END;
       ELSE DO;
           &ARRAYNAME{I} = SUBSTR(SCRAP,1,FINISH);
           START = FINISH+3;
           SCRAP = TRIM(LEFT(SUBSTR(SCRAP,START)));
           I = I+1;
       END;
    END;
%MEND(MAKEARRAYS);

%NFILES(A,9);
%NFILES(B,4);
%NFILES(C,10);
%NFILES(D,4);
%NFILES(G,5);
%NFILES(H,1);
%NFILES(J,21);
%NFILES(L,28);
%NFILES(M,7);
%NFILES(N,17);
%NFILES(P,2);
%NFILES(R,6);
%NFILES(S,3);
%NFILES(T,2);
%NFILES(V,15);

RUN;


/************************************************************************
   This code reads converted html files from IMS R&D Focus for
   the history information, which could not be exported in the
   usual fashion.

   NOTES:

   Use convert_html and convert_spaces scripts to do the following:

   All history files within a therapeutic class are concatenated.
   Make sure that the order of observations is not changed so
   the files can be matched to the data from read_rd_focus.sas.
   A NEWLINE MUST BE ADDED AT THE BOTTOM OF THE FILE, OTHERWISE
    THE LAST OBSERVATION WILL NOT BE READ.

   Then the file is filtered through convert_html to strip off the
     html code.
   Then the file is run through these filters to make it manageable
     in SAS.

   sed -e 's/  /|/g' <file in> > <file out>
   sed -e 's/||/|/g' <file in> > <file out>
   sed -e 's/||/|/g' <file in> > <file out>

   ...until the file is no longer changing.  The final stripped file
   is named ClassX_History.txt.

   Because the file has a non-standard format, it is somewhat complicated
    to input it into SAS. The code searches for relevant variable names
    and then parses the text that follows the name.
************************************************************************/

    * Macro to loop through each ATC1;
%MACRO READHIST(TCLASS);
  FILENAME IN1 "&LOCAL./IMS_RD/Class&TCLASS./Class&TCLASS._History.txt";

  DATA HERE.HISTORY&TCLASS;
   INFILE IN1 LENGTH=LINELEN LRECL=80000 LINE=LINENUM;

   INPUT @'Update Date|' @;
   START=LINENUM;
   VARLEN = MIN(LINELEN,32000);
   VARLEN2 = MAX(LINELEN-32000,0);
   INPUT LONGSTRING ~& $VARYING32000. VARLEN @;
   IF VARLEN2 GT 0 THEN INPUT #START @(VARLEN+14) LONGSTRING2 ~& $VARYING32000. VARLEN2;

     LENGTH UPDATE $10;
     UPDATE = SUBSTR(LONGSTRING,1,10);
     LONGSTRING = SUBSTR(LONGSTRING,11);

     LENGTH TRADENAME $100;
     MARK2 = INDEX(LONGSTRING,'Trade Name|')+11;
     IF MARK2 GT 11 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       TRADENAME = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH GENERICNAME $100;
     MARK2 = INDEX(LONGSTRING,'Generic Name|')+13;
     IF MARK2 GT 13 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       GENERICNAME = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH LABCODE $100;
     MARK2 = INDEX(LONGSTRING,'Lab Code|')+9;
     IF MARK2 GT 9 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       LABCODE = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH LATEST_NEWS $20000;
     MARK2 = INDEX(LONGSTRING,'Latest News|')+12;
     IF MARK2 GT 12 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       LATEST_NEWS = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH COMPANY $500;
     MARK2 = INDEX(LONGSTRING,'Company|')+8;
     IF MARK2 GT 8 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       COMPANY = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH CHEMICALNAME $100;
     MARK2 = INDEX(LONGSTRING,'Chemical Name|')+14;
     IF MARK2 GT 14 THEN DO;
       SCRAP =  SUBSTR(LONGSTRING,MARK2);
       MARK3 = INDEX(SCRAP,'|')-1;
       CHEMICALNAME = TRIM(LEFT(SUBSTR(LONGSTRING,MARK2,MARK3)));
     END;

     LENGTH HISTORYSTRING $20000;
     MARK2 = INDEX(LONGSTRING,'History|')+8;
     IF MARK2 GT 8 THEN DO;
       HISTORYSTRING = SUBSTR(LONGSTRING,MARK2);
       IF VARLEN2 GT 0 THEN HISTORYSTRING = TRIM(LEFT(HISTORYSTRING))||TRIM(LEFT(LONGSTRING2));
     END;
     ELSE DO;
       MARK2 = INDEX(LONGSTRING2,'History|')+8;
       IF MARK2 GT 8 THEN HISTORYSTRING = SUBSTR(LONGSTRING2,MARK2);
     END;


  /* Parse history */

   ARRAY DATE{*} $20 EVENTDATE1-EVENTDATE80;
   ARRAY EVENT{*} $300 EVENT1-EVENT80;
   ARRAY EVENTYY{*} EVENTYEAR1-EVENTYEAR80;
   ARRAY EVENTQ{*} EVENTQTR1-EVENTQTR80;
   ARRAY EVENTM{*} EVENTMON1-EVENTMON80;
   I = 0;
   DO WHILE (INDEX(HISTORYSTRING,'|') GT 0);
      I = I+1;
      MARK1 = INDEX(HISTORYSTRING,'|')-1;
      DATE{I} = TRIM(LEFT(SUBSTR(HISTORYSTRING,1,MARK1)));
      HISTORYSTRING = SUBSTR(HISTORYSTRING,MARK1+2);
      MARK2 = INDEX(HISTORYSTRING,'|')-1;
      EVENT{I} = UPCASE(TRIM(LEFT(SUBSTR(HISTORYSTRING,1,MARK2))));
      HISTORYSTRING = SUBSTR(HISTORYSTRING,MARK2+2);
	  * Correct some common misspellings;
      EVENT{I} = TRANWRD(EVENT{I},'AGEEMENT','AGREEMENT');
      EVENT{I} = TRANWRD(EVENT{I},'AGREEEMENT','AGREEMENT');
      EVENT{I} = TRANWRD(EVENT{I},'AGEEMEENT','AGREEMENT');
      EVENT{I} = TRANWRD(EVENT{I},'AGREEMEENT','AGREEMENT');
      EVENT{I} = TRANWRD(EVENT{I},'AGREETMENT','AGREEMENT');
      EVENT{I} = TRANWRD(EVENT{I},'BETEEN','BETWEEN');
      EVENT{I} = TRANWRD(EVENT{I},'BETWEEEN','BETWEEN');
      EVENT{I} = TRANWRD(EVENT{I},'BETWEEM','BETWEEN');
      EVENT{I} = TRANWRD(EVENT{I},'BETWEEB','BETWEEN');
      EVENT{I} = TRANWRD(EVENT{I},'LICENCE','LICENSE');
      EVENT{I} = TRANWRD(EVENT{I},'LICENCING','LICENSING');
      EVENT{I} = TRANWRD(EVENT{I},'AVALIABLE','AVAILABLE');
      EVENT{I} = TRANWRD(EVENT{I},'AVALABLE','AVAILABLE');
      EVENT{I} = TRANWRD(EVENT{I},'AVAILABE','AVAILABLE');
      EVENT{I} = TRANWRD(EVENT{I},'AVAIABLE','AVAILABLE');
      EVENT{I} = TRANWRD(EVENT{I},'DICONTINUED','DISCONTINUED');
      EVENT{I} = TRANWRD(EVENT{I},'MEDICIANL','MEDICINAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECINICAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINCAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINCIAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINIAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINICA','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINICL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRELINICAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLINICALL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'RECOMENDED','RECOMMENDED');
      EVENT{I} = TRANWRD(EVENT{I},'DISCONTINED','DISCONTINUED');
      EVENT{I} = TRANWRD(EVENT{I},'DICONTINUED','DISCONTINUED');
      EVENT{I} = TRANWRD(EVENT{I},'WTH','WITH');
      EVENT{I} = TRANWRD(EVENT{I},'AVAILALBLE','AVAILABLE');
      EVENT{I} = TRANWRD(EVENT{I},'PRECLNICAL','PRECLINICAL');
      EVENT{I} = TRANWRD(EVENT{I},'TRANSFERED','TRANSFERRED');

      LENGTH TEXTYEAR $4;
      IF DATE{I} IN('DATE UKNOWN','DATE UNKOWN') THEN DATE{I}='DATE UNKNOWN';
      IF INDEX(DATE{I},'UNKNOWN') LT 1 THEN
        DO T=1940 TO 2007;
           TEXTYEAR = PUT(T,4.);
           IF INDEX(DATE{I},TEXTYEAR) GT 0 THEN DO;
    	     EVENTYEAR=T;
    	     T=2007;
           END;
        END;
      IF EVENTYEAR EQ . & INDEX(DATE{I},'96') GT 0 THEN EVENTYEAR=1996;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'206') GT 0 THEN EVENTYEAR=2006;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'005') GT 0 THEN EVENTYEAR=2005;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'194') GT 0 THEN EVENTYEAR=1994;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'93') GT 0 THEN EVENTYEAR=1993;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'04') GT 0 THEN EVENTYEAR=2004;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'2995') GT 0 THEN EVENTYEAR=1995;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'2O03') GT 0 THEN EVENTYEAR=2003;
      ELSE IF EVENTYEAR EQ . & INDEX(DATE{I},'199O') GT 0 THEN EVENTYEAR=1990;
      * Remaining years should be checked for mistakes;
    
      IF INDEX(DATE{I},'JAN') GT 0 THEN EVENTMONTH=1;
      ELSE IF INDEX(DATE{I},'FEB') GT 0 THEN EVENTMONTH=2;
      ELSE IF INDEX(DATE{I},'MAR') GT 0 THEN EVENTMONTH=3;
      ELSE IF INDEX(DATE{I},'APR') GT 0 THEN EVENTMONTH=4;
      ELSE IF INDEX(DATE{I},'MAY') GT 0 THEN EVENTMONTH=5;
      ELSE IF INDEX(DATE{I},'JUN') GT 0 THEN EVENTMONTH=6;
      ELSE IF INDEX(DATE{I},'JUL') GT 0 THEN EVENTMONTH=7;
      ELSE IF INDEX(DATE{I},'AUG') GT 0 THEN EVENTMONTH=8;
      ELSE IF INDEX(DATE{I},'SEP') GT 0 THEN EVENTMONTH=9;
      ELSE IF INDEX(DATE{I},'OCT') GT 0 THEN EVENTMONTH=10;
      ELSE IF INDEX(DATE{I},'NOV') GT 0 THEN EVENTMONTH=11;
      ELSE IF INDEX(DATE{I},'DEC') GT 0 THEN EVENTMONTH=12;
    
      IF INDEX(DATE{I},'Q1') GT 0 THEN EVENTQTR=1;
      ELSE IF INDEX(DATE{I},'Q2') GT 0 THEN EVENTQTR=2;
      ELSE IF INDEX(DATE{I},'Q3') GT 0 THEN EVENTQTR=3;
      ELSE IF INDEX(DATE{I},'Q4') GT 0 THEN EVENTQTR=4;
      ELSE IF INDEX(DATE{I},'1Q') GT 0 THEN EVENTQTR=1;
      ELSE IF INDEX(DATE{I},'2Q') GT 0 THEN EVENTQTR=2;
      ELSE IF INDEX(DATE{I},'3Q') GT 0 THEN EVENTQTR=3;
      ELSE IF INDEX(DATE{I},'4Q') GT 0 THEN EVENTQTR=4;
      ELSE IF INDEX(DATE{I},'1H') GT 0 THEN EVENTQTR=2;
      ELSE IF INDEX(DATE{I},'2H') GT 0 THEN EVENTQTR=4;
    
      IF EVENTMONTH LT 4 & EVENTMONTH NE . THEN EVENTQTR=1;
      ELSE IF EVENTMONTH LT 7 & EVENTMONTH NE . THEN EVENTQTR=2;
      ELSE IF EVENTMONTH LT 10 & EVENTMONTH NE . THEN EVENTQTR=3;
      ELSE IF EVENTMONTH GE 10 & EVENTMONTH NE . THEN EVENTQTR=4;
    
      EVENTYY{I}=EVENTYEAR;
      EVENTM{I}=EVENTMONTH;
      EVENTQ{I}=EVENTQTR;

      IF EVENTQTR EQ . AND I LT 80 THEN DO;
         IF EVENTYY{I} EQ EVENTYY{I+1} AND EVENTQ{I+1} LT 4 THEN EVENTQ{I}=EVENTQ{I+1}+1;
         ELSE IF EVENTYY{I} EQ EVENTYY{I+1} AND EVENTQ{I+1} EQ 4 THEN EVENTQ{I}=4;
         ELSE IF EVENTYY{I} GT EVENTYY{I+1} THEN EVENTQ{I}=2;
      END;

      EVENTYEAR = .;
      EVENTMONTH = .;
      EVENTQTR = .;
  
   END;
   N_EVENTS = I;
   
  * Convert update date to a SAS date;
  DATE_UPDATE = MDY(SUBSTR(UPDATE,4,2),SUBSTR(UPDATE,1,2),SUBSTR(UPDATE,7,4));
  UPDATE_YEAR = YEAR(DATE_UPDATE);

  /* Parse firm names */
	ARRAY FIRM(30) $100 COMPANY1-COMPANY30;
	START = 1;
	SCRAP = TRIM(COMPANY);
        I=1;
	DO WHILE (I LE 30);
	   FINISH = INDEX(SCRAP,')');
	   IF FINISH LE 0 THEN DO;
	     FIRM{I} = SCRAP;
	     I=31;
	   END;
	   ELSE DO;
	     FIRM{I} = SUBSTR(SCRAP,1,FINISH);
	     FIRM{I} = TRIM(LEFT(FIRM{I}));
	     IF SUBSTR(FIRM{I},1,1) EQ '(' THEN FIRM{I}='';
	       ELSE FIRM{I}=SUBSTR(FIRM{I},1,INDEX(FIRM{I},'(')-1);
	     START = FINISH+1;
	     SCRAP = TRIM(LEFT(SUBSTR(SCRAP,START)));
	   END;
   	   I = I+1;
	END;

   DROP MARK1-MARK3 I LONGSTRING SCRAP VARLEN VARLEN2 LONGSTRING START TEXTYEAR T
	EVENTMONTH EVENTQTR EVENTYEAR START FINISH;


PROC FREQ DATA=HERE.HISTORY&TCLASS;
  TABLES N_EVENTS;
  TABLES UPDATE_YEAR;

%MEND(READHIST);

%READHIST(A);
%READHIST(B);
%READHIST(C);
%READHIST(D);
%READHIST(G);
%READHIST(H);
%READHIST(J);
%READHIST(L);
%READHIST(M);
%READHIST(N);
%READHIST(P);
%READHIST(R);
%READHIST(S);
%READHIST(T);
%READHIST(V);

RUN;

/*************************************************************************
  STEP 2: Merge files
    Code to merge R&D Focus data with the history information.
*************************************************************************/

%MACRO MERGECLASS(TCLASS);

DATA HERE.MATCHED_&TCLASS;
  * Note that since there is no index created or common variable, this is not a
    match merge. It is essential that the order of exported records is
    maintained so that observations line up correctly.;
  MERGE HERE.CLASS&TCLASS (DROP=HISTORY1-HISTORY80) HERE.HISTORY&TCLASS;

  PROC PRINT DATA=HERE.MATCHED_&TCLASS (OBS=100);
    VAR PREFERRED_NAME TRADENAME GENERICNAME COMPANY LEAD_COMPANY;

%MEND(MERGECLASS);

%MERGECLASS(A);
%MERGECLASS(B);
%MERGECLASS(C);
%MERGECLASS(D);
%MERGECLASS(G);
%MERGECLASS(H);
%MERGECLASS(J);
%MERGECLASS(L);
%MERGECLASS(M);
%MERGECLASS(N);
%MERGECLASS(P);
%MERGECLASS(R);
%MERGECLASS(S);
%MERGECLASS(T);
%MERGECLASS(V);

* Final merged dataset of all ATC1s;
DATA HERE.MATCHED_RD_HISTORY;
 SET HERE.MATCHED_A;

PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_B;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_C;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_D;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_G;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_J;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_H;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_L;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_M;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_N;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_P;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_R;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_S;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_T;
PROC APPEND BASE=HERE.MATCHED_RD_HISTORY DATA=HERE.MATCHED_V;


/*************************************************************************
  STEP 3: Create variables
    This code takes the IMS R&D Focus data and defines the year in which
    each project observation started various phases of development.
    
    Each indication from the IMS R&D Focus data is assigned WHO code that
    corresponds to an ICD9 or ICD10 in the WHO Mortality data.

    Neglected disease definitions come from Moran et al (2009) paper.

    This program requires the file disease_assignment_code, which maps
    the indications to WHO codes.

    The output is a file with one observation per disease-year, and a count
    of new projects in Phases I-III.
    
*************************************************************************/

* Eliminate duplicate records, which result from the fact that each project
  may be assigned to multiple ATCs, and therefore exported multiple times.;

  PROC SORT NODUP DATA=HERE.MATCHED_RD_HISTORY;
    BY PREFERRED_NAME COMPANY1 LATEST_NEWS;
  PROC CONTENTS DATA=HERE.MATCHED_RD_HISTORY;
  
  DATA HERE.MATCHED_RD_HISTORY;
    SET HERE.MATCHED_RD_HISTORY;
  * Create project identifier;
    PROJECT_ID = _N_;
  
* Code to characterize events, which is messy.;
  DATA HERE.PROJECTS (KEEP=
    YEAR_APPROVED YEAR_LAUNCHED
    INDICATION PROJECT_ID LAUNCH_COUNTRY NEGLECTED NEGLECTED_NONHIV NUMBER_FIRMS
    WHO_CODE YEAR_PRECLINICAL NUMBER_FRANCHISE
    YEAR_PHASE1 YEAR_PHASE2 YEAR_PHASE3 YEAR_LAUNCHED YEAR_DISCONTINUED PREFERRED_NAME);
    SET HERE.MATCHED_RD_HISTORY;
    LENGTH DEV_STATUS $30;

    ARRAY EVENT{*} EVENT1-EVENT80;
    ARRAY EVENTDATE{*} EVENTDATE1-EVENTDATE80;
    ARRAY EVENTYY{*} EVENTYEAR1-EVENTYEAR80;
    ARRAY EVENTQ{*} EVENTQTR1-EVENTQTR80;
    ARRAY EVENTM{*} EVENTMON1-EVENTMON80;
  * Loop backwards over events, since they are listed
    in reverse chronological order.;
    IF N_EVENTS NE . THEN DO I=N_EVENTS TO 1 BY -1;
       EVENTFIELD=UPCASE(EVENT{I});
       DATE=EVENTDATE{I};
       EVENTYEAR=EVENTYY{I};
       EVENTMONTH=EVENTM{I};
       EVENTQTR=EVENTQ{I};

     * Determine which events correspond to development status;
     IF INDEXW(EVENTFIELD,'LAUNCHED') GT 0 
   	THEN DEV_STATUS = 'LAUNCHED';
     ELSE IF INDEX(EVENTFIELD,'MARKETED,') GT 0 
   	THEN DEV_STATUS = 'LAUNCHED';   
     ELSE IF INDEX(EVENTFIELD,'DISCOVERY') GT 0 
   	THEN DEV_STATUS = 'DISCOVERY';
     ELSE IF INDEX(EVENTFIELD,'DISCLOSED') GT 0 
   	THEN DEV_STATUS = 'DISCOVERY';
     ELSE IF INDEX(EVENTFIELD,'LEAD') GT 0 
   	& INDEX(EVENTFIELD,'IDENTIFIED') GT 0
   	THEN DEV_STATUS = 'DISCOVERY';
     ELSE IF INDEX(EVENTFIELD,'LEAD') GT 0 
   	& INDEX(EVENTFIELD,'OPTIMIZATION') GT 0
   	THEN DEV_STATUS = 'DISCOVERY';
     ELSE IF INDEX(EVENTFIELD,'LEAD') GT 0 
   	& INDEX(EVENTFIELD,'SELECTED') GT 0
   	THEN DEV_STATUS = 'DISCOVERY';   
     ELSE IF INDEX(EVENTFIELD,'APPROVAL NOT RECOMMENDED') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'APPROVAL DENIED') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'APPROVAL REJECTED') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'NON-APPROVABLE') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'NONAPPROVABLE') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'FAILS TO RECOMMEND') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'REJECTION') GT 0 
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'REFUSE') GT 0 
   	& INDEX(EVENTFIELD,'APPROVAL') GT 0
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
     ELSE IF INDEX(EVENTFIELD,'AGAINST') GT 0 
   	& INDEX(EVENTFIELD,'APPROVAL') GT 0
   	THEN DEV_STATUS = 'REGULATORY REJECTION';
   
     IF DEV_STATUS NOT IN('REGULATORY REJECTION','WITHDRAWN') THEN DO;
       IF (INDEX(EVENTFIELD,'APPROVED') GT 0 
   	| INDEX(EVENTFIELD,'APPROVAL') GT 0) 
   	& INDEXW(EVENTFIELD,'RECOMMENDED') LT 1 & INDEXW(EVENTFIELD,'PROTOCOL') LT 1 
   	& INDEX(EVENTFIELD,'DNA ADVISORY') LT 1 & INDEX(EVENTFIELD,'COMPASSIONATE') LT 1
   	THEN DEV_STATUS = 'APPROVED';

     * Best guess where event field is not specific;   
       IF DEV_STATUS EQ '' THEN DO;
          IF (INDEXW(EVENTFIELD,'IND') GT 0 | INDEXW(EVENTFIELD,'INDS') GT 0 ) 
           & (INDEX(EVENTFIELD,'REJECT') LT 1 & INDEX(EVENTFIELD,'WITHDRAWN') LT 1)
             THEN DEV_STATUS = 'IND';
          ELSE IF (INDEX(EVENTFIELD,'IND SUBMISSION') GT 0 
                 | INDEX(EVENTFIELD,'IND SUBMITTED') GT 0 
                 | INDEX(EVENTFIELD,'IND FILED') GT 0 
                 | INDEX(EVENTFIELD,'IND CLEARANCE') GT 0)
             THEN DEV_STATUS = 'IND';     
          ELSE IF INDEX(EVENTFIELD,'RESEARCH PROGRAM UNDER WAY') GT 0
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'RESEARCH PROJECT UNDER WAY') GT 0
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'RESEARCH PROGRAM SET UP') GT 0
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'RESEARCH UNDER WAY') GT 0
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'PRECLINICAL') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'PRECLINCIAL') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'PRECLINIAL') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'PRECLINCAL') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PRECLINICAL';
          ELSE IF INDEX(EVENTFIELD,'PRE-CLINICAL') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PRECLINICAL';        
          ELSE IF INDEX(EVENTFIELD,'PHASE III') GT 0 
   	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PHASE III';
          ELSE IF INDEX(EVENTFIELD,'PHASE II') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PHASE II';     
          ELSE IF INDEX(EVENTFIELD,'PHASE I') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'PHASE I';        
          ELSE IF INDEX(EVENTFIELD,'DISCONTINUE') GT 0 
   	  & INDEX(EVENTFIELD,'LICENSE') LT 1 
        	THEN DEV_STATUS = 'DISCONTINUED';
          ELSE IF INDEX(EVENTFIELD,'DICONTINUE') GT 0 
      	  & INDEX(EVENTFIELD,'LICENSE') LT 1 
        	THEN DEV_STATUS = 'DISCONTINUED';
        
          ELSE IF INDEX(EVENTFIELD,'SUSPENDED') GT 0 
      	  & INDEX(EVENTFIELD,'LICENSE') LT 1 
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'HALTED') GT 0 
      	  & INDEX(EVENTFIELD,'LICENSE') LT 1 
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NOT IN ACTIVE DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NO ACTIVE DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NO FURTHER DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NO LONGER IN DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NOT IN DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'DEVELOPMENT') GT 0 
      	  & INDEX(EVENTFIELD,'CEASE') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
          ELSE IF INDEX(EVENTFIELD,'NOT IN ACTIVE PIPELINE') GT 0
        	THEN DEV_STATUS = 'SUSPENDED/HALTED';
        
        
          ELSE IF INDEX(EVENTFIELD,'CLINICALS') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'CLINICAL TRIALS, PHASE UNKNOWN';
          ELSE IF INDEX(EVENTFIELD,'CLINICAL TRIALS') GT 0 
      	  & INDEX(EVENTFIELD,'HALTED') LT 1 
        	THEN DEV_STATUS = 'CLINICAL TRIALS, PHASE UNKNOWN';
        
          ELSE IF INDEX(EVENTFIELD,'KNOWN TO BE IN ACTIVE DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'DEVELOPMENT, STAGE UNKNOWN';
          ELSE IF INDEX(EVENTFIELD,'KNOWN TO BE IN DEVELOPMENT') GT 0
        	THEN DEV_STATUS = 'DEVELOPMENT, STAGE UNKNOWN';
          
          ELSE IF INDEX(EVENTFIELD,'PATENT') GT 0 
      	  & INDEX(EVENTFIELD,'APPLICATION') GT 0
        	THEN DEV_STATUS = 'PATENT FILED';
          ELSE IF INDEX(EVENTFIELD,'PRIORITY') GT 0 
      	  & INDEX(EVENTFIELD,'APPLICATION') GT 0
        	THEN DEV_STATUS = 'PATENT FILED';
          ELSE IF INDEX(EVENTFIELD,'PATENT') GT 0 
      	  & INDEX(EVENTFIELD,'FILE') GT 0
        	THEN DEV_STATUS = 'PATENT FILED';
          ELSE IF INDEX(EVENTFIELD,'PATENT') GT 0 
      	  & INDEX(EVENTFIELD,'GRANTED') GT 0
        	THEN DEV_STATUS = 'PATENT GRANTED';
          ELSE IF INDEX(EVENTFIELD,'PATENT') GT 0 
      	  & INDEX(EVENTFIELD,'RECEIVE') GT 0
        	THEN DEV_STATUS = 'PATENT GRANTED';
          ELSE IF INDEX(EVENTFIELD,'PATENT') GT 0 
      	  & INDEX(EVENTFIELD,'ISSUE') GT 0
        	THEN DEV_STATUS = 'PATENT GRANTED';
     
          ELSE IF INDEX(EVENTFIELD,'PRE-REGISTRATION') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'PE-REGISTRATION') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'PRE-REGISTERED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
        
          ELSE IF INDEX(EVENTFIELD,'NDA FILED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'NDA SUBMITTED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'CTA FILED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'CTA SUBMITTED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'CTX FILED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'CTX SUBMITTED') GT 0 
        	THEN DEV_STATUS = 'PRE-REGISTRATION';
        
          ELSE IF INDEX(EVENTFIELD,'REGISTRATION') GT 0 
        	THEN DEV_STATUS = 'REGISTRATION';
          ELSE IF INDEX(EVENTFIELD,'REGISTERED') GT 0 
        	THEN DEV_STATUS = 'REGISTRATION';
        
        
          ELSE IF INDEX(EVENTFIELD,'APPROVAL RECOMMENDED') GT 0 
        	THEN DEV_STATUS = 'APPROVABLE';
          ELSE IF INDEX(EVENTFIELD,'RECOMMENDED FOR APPROVAL') GT 0 
        	THEN DEV_STATUS = 'APPROVABLE';
          ELSE IF INDEX(EVENTFIELD,'RECOMMEND FOR APPROVAL') GT 0 
        	THEN DEV_STATUS = 'APPROVABLE';
          ELSE IF INDEX(EVENTFIELD,'RECOMMENDED,') GT 0 
        	THEN DEV_STATUS = 'APPROVABLE';
          ELSE IF INDEX(EVENTFIELD,'APPROVABLE') GT 0 
        	THEN DEV_STATUS = 'APPROVABLE';
        
          ELSE IF INDEX(EVENTFIELD,'WITHDRAWN') GT 0 
        	THEN DEV_STATUS = 'WITHDRAWN';
      
          END; * Matches IF DEV_STATUS EQ '' line;
      
        END; * Matches IF DEV_STATUS NOT WITHDRAWN/REJECTION line;      
   
       IF DEV_STATUS EQ 'DISCOVERY' 
         & (EVENTYEAR LT YEAR_DISCOVERY | YEAR_DISCOVERY EQ .) THEN DO;
    	YEAR_DISCOVERY = EVENTYEAR;
   	QTR_DISCOVERY = EVENTQTR;
   	MONTH_DISCOVERY = EVENTMONTH;
       END;
       IF DEV_STATUS EQ 'PATENT FILED' 
   	| DEV_STATUS EQ 'PATENT_GRANTED' 
   	& (EVENTYEAR LT YEAR_PATENT | YEAR_PATENT EQ .) THEN DO;
    	YEAR_PATENT = EVENTYEAR;
   	QTR_PATENT = EVENTQTR;
   	MONTH_PATENT = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'APPROVABLE'  
   	& (EVENTYEAR LT YEAR_APPROVABLE | YEAR_APPROVABLE EQ .) THEN DO;
   	YEAR_APPROVABLE = EVENTYEAR;
   	QTR_APPROVABLE = EVENTQTR;
   	MONTH_APPROVABLE = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'APPROVED' 
   	& (EVENTYEAR LT YEAR_APPROVED | YEAR_APPROVED EQ .) THEN DO;
   	YEAR_APPROVED = EVENTYEAR;
   	QTR_APPROVED = EVENTQTR;
   	MONTH_APPROVED = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'DISCONTINUED'  
   	& (EVENTYEAR LT YEAR_DISCONTINUED | YEAR_DISCONTINUED EQ .) THEN DO;
   	YEAR_DISCONTINUED = EVENTYEAR;
   	QTR_DISCONTINUED = EVENTQTR;
   	MONTH_DISCONTINUED = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'IND' 
   	& (EVENTYEAR LT YEAR_IND | YEAR_IND EQ .) THEN DO;
   	YEAR_IND = EVENTYEAR;
   	QTR_IND = EVENTQTR;
   	MONTH_IND = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'LAUNCHED'  
   	& (EVENTYEAR LT YEAR_LAUNCHED | YEAR_LAUNCHED EQ .) THEN DO;
   	YEAR_LAUNCHED = EVENTYEAR;
   	QTR_LAUNCHED = EVENTQTR;
   	MONTH_LAUNCHED = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'PHASE I'  
   	& (EVENTYEAR LT YEAR_PHASE1 | YEAR_PHASE1 EQ .) THEN DO;
   	YEAR_PHASE1 = EVENTYEAR;
   	QTR_PHASE1 = EVENTQTR;
   	MONTH_PHASE1 = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'PHASE II'  
   	& (EVENTYEAR LT YEAR_PHASE2 | YEAR_PHASE2 EQ .) THEN DO;
   	YEAR_PHASE2 = EVENTYEAR;
   	QTR_PHASE2 = EVENTQTR;
   	MONTH_PHASE2 = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'PHASE III'  
   	& (EVENTYEAR LT YEAR_PHASE3 | YEAR_PHASE3 EQ .) THEN DO;
   	YEAR_PHASE3 = EVENTYEAR;
   	QTR_PHASE3 = EVENTQTR;
   	MONTH_PHASE3 = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'PRE-REGISTRATION'  
   	& (EVENTYEAR LT YEAR_PREREG | YEAR_PREREG EQ .) THEN DO;
   	YEAR_PREREG = EVENTYEAR;
   	QTR_PREREG = EVENTQTR;
   	MONTH_PREREG = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'PRECLINICAL'  
   	& (EVENTYEAR LT YEAR_PRECLINICAL | YEAR_PRECLINICAL EQ .) THEN DO;
   	YEAR_PRECLINICAL = EVENTYEAR;
   	QTR_PRECLINICAL = EVENTQTR;
   	MONTH_PRECLINICAL = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'REGISTRATION'  
   	& (EVENTYEAR LT YEAR_REGISTRATION | YEAR_REGISTRATION EQ .) THEN DO;
   	YEAR_REGISTRATION = EVENTYEAR;
   	QTR_REGISTRATION = EVENTQTR;
   	MONTH_REGISTRATION = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'REGULATORY REJECTION'  
   	& (EVENTYEAR LT YEAR_REJECTION | YEAR_REJECTION EQ .) THEN DO;
   	YEAR_REJECTION = EVENTYEAR;
   	QTR_REJECTION = EVENTQTR;
   	MONTH_REJECTION = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'SUSPENDED/HALTED'  
   	& (EVENTYEAR LT YEAR_HALTED | YEAR_HALTED EQ .) THEN DO;
   	YEAR_HALTED = EVENTYEAR;
   	QTR_HALTED = EVENTQTR;
   	MONTH_HALTED = EVENTMONTH;
       END;
       ELSE IF DEV_STATUS EQ 'WITHDRAWN'  
   	& (EVENTYEAR LT YEAR_WITHDRAWN | YEAR_WITHDRAWN EQ .) THEN DO;
   	YEAR_WITHDRAWN = EVENTYEAR;
   	QTR_WITHDRAWN = EVENTQTR;
   	MONTH_WITHDRAWN = EVENTMONTH;
       END;

     * See if status can be determined from other fields.;
       IF DEV_STATUS EQ '' THEN DO;
          IF INDEXW(UPCASE(DEV_SUMMARY),'PHASE') THEN PUT PROJECT_ID= PREFERRED_NAME= CLASS_CODE1= DEV_SUMMARY=;
          ELSE IF INDEXW(UPCASE(SCIENCE_SUMMARY),'PHASE') THEN PUT PROJECT_ID= PREFERRED_NAME= CLASS_CODE1= SCIENCE_SUMMARY=;
       END;
   
       DEV_STATUS='';
       NAME_STATUS='';
       LICENSE_STATUS='';
       OTHER_STATUS='';
   
      END;  * Matches DO loop over events;
   
  * Now assign disease codes and output one obs per indication per project;    
    LENGTH WHO_CODE $4;
    ARRAY INDS{*} INDICATION1-INDICATION22;
    DO I=1 TO MIN(NUMBER_INDICATION,22);
        WHO_CODE = '';
        INDICATION = INDS{I};
        * This code links indications to ICD10 codes from WHO;
        %INCLUDE 'disease_assignment_code';
        * This defines neglected diseases;
        NEGLECTED = WHO_CODE IN('1020','1021','1005','1006','1074','1059','1004','1002','1023','1022','1025','1017','1065');
        * This definition excludes HIV;
        NEGLECTED_NONHIV = WHO_CODE IN('1021','1005','1006','1074','1059','1004','1002','1023','1022','1025','1017','1065');
	OUTPUT;
    END;


    PROC MEANS;
        VAR YEAR_PHASE1 YEAR_PHASE2 YEAR_PHASE3 YEAR_APPROVED YEAR_LAUNCHED NUMBER_FIRMS NUMBER_FRANCHISE;
    PROC FREQ;
        TABLES WHO_CODE;
        TABLES NEGLECTED;
        TABLES NEGLECTED_NONHIV;
    
* Create independent variable: number of products approved before 1990;
   PROC SORT DATA=HERE.PROJECTS (WHERE=(MAX(YEAR_LAUNCHED,YEAR_APPROVED) LE 1990 AND MAX(YEAR_LAUNCHED,YEAR_APPROVED) NE . AND WHO_CODE NE '')) OUT=APPROVED;
       BY WHO_CODE;
   PROC MEANS DATA=APPROVED NOPRINT;
       BY WHO_CODE;
       VAR NEGLECTED;
       OUTPUT OUT=APPROVED N=TREATMENTS1990;
       
* Create dependent variables: number of starts in Phase 1-Phase 3;        
   PROC SORT DATA=HERE.PROJECTS (WHERE=(YEAR_PHASE1 NE . AND WHO_CODE NE '')) OUT=PHASE1 (RENAME=(YEAR_PHASE1=YEAR));
       BY WHO_CODE YEAR_PHASE1;
   PROC MEANS DATA=PHASE1 NOPRINT;
       BY WHO_CODE YEAR;
       ID NEGLECTED NEGLECTED_NONHIV;
       VAR NEGLECTED;
       OUTPUT OUT=PHASE1_STARTS N=NEW_PHASE1;
       
   PROC SORT DATA=HERE.PROJECTS (WHERE=(YEAR_PHASE2 NE . AND WHO_CODE NE '')) OUT=PHASE2 (RENAME=(YEAR_PHASE2=YEAR));
       BY WHO_CODE YEAR_PHASE2;
   PROC MEANS DATA=PHASE2 NOPRINT;
       BY WHO_CODE YEAR;
       ID NEGLECTED NEGLECTED_NONHIV;
       VAR NEGLECTED;
       OUTPUT OUT=PHASE2_STARTS N=NEW_PHASE2;
       
   PROC SORT DATA=HERE.PROJECTS (WHERE=(YEAR_PHASE3 NE . AND WHO_CODE NE '')) OUT=PHASE3 (RENAME=(YEAR_PHASE3=YEAR));
       BY WHO_CODE YEAR_PHASE3;
   PROC MEANS DATA=PHASE3 NOPRINT;
       BY WHO_CODE YEAR;
       ID NEGLECTED NEGLECTED_NONHIV;
       VAR NEGLECTED;
       OUTPUT OUT=PHASE3_STARTS N=NEW_PHASE3;
       
   PROC SORT DATA=PHASE1_STARTS;
       BY WHO_CODE YEAR;
   PROC SORT DATA=PHASE2_STARTS;
       BY WHO_CODE YEAR;
   PROC SORT DATA=PHASE3_STARTS;
       BY WHO_CODE YEAR;

   DATA HERE.DISEASE_STARTS;
       MERGE PHASE1_STARTS PHASE2_STARTS PHASE3_STARTS;
       BY WHO_CODE YEAR;       
   PROC SORT;
       BY WHO_CODE;
   DATA HERE.DISEASE_STARTS (DROP=_TYPE_ _FREQ_);
       MERGE HERE.DISEASE_STARTS APPROVED;
       BY WHO_CODE;       
       IF YEAR GE 1990 AND YEAR LE 2007;
   PROC MEANS;

ENDSAS;
    
